{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"
\n",
" \n",
" \n",
" \n",
" Try in Google Colab\n",
" \n",
" | \n",
" \n",
" \n",
" \n",
" Share via nbviewer\n",
" \n",
" | \n",
" \n",
" \n",
" \n",
" View on GitHub\n",
" \n",
" | \n",
" \n",
" \n",
" \n",
" Download notebook\n",
" \n",
" | \n",
"
\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# **Video Labels in FiftyOne**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **A guided example with ASL videos**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First download and unzip the dataset.\n",
"\n",
"We will be using the [WLASL Dataset](https://www.kaggle.com/datasets/risangbaskoro/wlasl-processed), a dataset comprised of actors performing sign language."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# %pip (rather than !pip) guarantees the install targets this kernel's environment\n",
"%pip install kaggle"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/dan/.kaggle/kaggle.json'\n",
"Downloading wlasl-processed.zip to /home/dan/Documents/tnt\n",
"100%|█████████████████████████████████████▉| 4.82G/4.82G [02:14<00:00, 54.0MB/s]\n",
"100%|██████████████████████████████████████| 4.82G/4.82G [02:14<00:00, 38.4MB/s]\n"
]
}
],
"source": [
"!kaggle datasets download -d risangbaskoro/wlasl-processed"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"# -p makes this cell idempotent: re-running the notebook no longer fails\n",
"# if the directory already exists\n",
"!mkdir -p wlasl-processed"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# -q silences the huge per-file listing; -n skips files that already exist\n",
"# instead of prompting, so a re-run cannot hang on an interactive question\n",
"!unzip -q -n wlasl-processed.zip -d wlasl-processed"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np # linear algebra (NOTE(review): unused in the visible cells below)\n",
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"import json # NOTE(review): unused below; labels are loaded via pd.read_json\n",
"import os # path manipulation (basename/splitext) for video-id lookup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load in the labels for the dataset"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"# Root of the unzipped Kaggle download; WLASL_v0.3.json maps each gloss\n",
"# (the signed word) to a list of annotated video instances\n",
"main_path = './wlasl-processed/'\n",
"wlasl_df = pd.read_json(main_path + 'WLASL_v0.3.json')"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" gloss | \n",
" instances | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" book | \n",
" [{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra... | \n",
"
\n",
" \n",
" 1 | \n",
" drink | \n",
" [{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f... | \n",
"
\n",
" \n",
" 2 | \n",
" computer | \n",
" [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_... | \n",
"
\n",
" \n",
" 3 | \n",
" before | \n",
" [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_... | \n",
"
\n",
" \n",
" 4 | \n",
" chair | \n",
" [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" gloss instances\n",
"0 book [{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra...\n",
"1 drink [{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f...\n",
"2 computer [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...\n",
"3 before [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...\n",
"4 chair [{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."
]
},
"execution_count": 159,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"wlasl_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
"mp4_dir = main_path + \"videos\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Creating the Video Dataset**"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 100% |█████████████| 11980/11980 [1.7s elapsed, 0s remaining, 7.2K samples/s] \n",
"Computing metadata...\n",
" 100% |█████████████| 11980/11980 [1.2m elapsed, 0s remaining, 403.2 samples/s] \n"
]
}
],
"source": [
"# NOTE: the alias is required -- every call below uses `fo`, so a bare\n",
"# `import fiftyone` raises NameError\n",
"import fiftyone as fo\n",
"\n",
"# Build a FiftyOne video dataset directly from the directory of .mp4 files\n",
"dataset = fo.Dataset.from_dir(dataset_dir=mp4_dir,dataset_type=fo.types.VideoDirectory)\n",
"dataset.ensure_frames()    # materialize per-frame documents for every video\n",
"dataset.compute_metadata() # fills frame_width/frame_height/duration used in later cells\n",
"dataset.name = 'wlasl-dataset'\n",
"dataset.persistent = True  # keep the dataset in the database after the session ends"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
"\n",
"
\n",
" \n",
"
\n",
"

\n",
"
\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\r\n",
"Could not connect session, trying again in 10 seconds\r\n",
"\n"
]
}
],
"source": [
"session = fo.launch_app(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Sample Detections**"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"def find_row_by_video_id(dataframe, video_id):\n",
" for index, row in dataframe.iterrows():\n",
" for instance in row['instances']:\n",
" if instance['video_id'] == video_id:\n",
" return row, instance\n",
" return None "
]
},
{
"cell_type": "code",
"execution_count": 164,
"metadata": {},
"outputs": [],
"source": [
"# Seed the random sample so Restart & Run All labels the same 100 videos\n",
"view = dataset.take(100, seed=51)"
]
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"for sample in view:\n",
" base_file_name = os.path.basename(sample.filepath)\n",
" video_id, extension = os.path.splitext(base_file_name)\n",
" row, inst = find_row_by_video_id(wlasl_df,video_id)\n",
" gloss = row[\"gloss\"]\n",
" bbox = inst[\"bbox\"]\n",
" imw = sample.metadata.frame_width\n",
" imh = sample.metadata.frame_height\n",
" x1 = bbox[0] / imw\n",
" x2 = bbox[2] / imw\n",
" y1 = bbox[1] / imh\n",
" y2 = bbox[3] / imh\n",
" bbox = [x1,y1,x2-x1,y2-y1]\n",
" det = fo.Detection(bounding_box=bbox,label=gloss)\n",
" sample['Sample Label'] = fo.Detections(detections=[det])\n",
" \n",
" sample.save()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
"\n",
"
\n",
" \n",
"
\n",
"

\n",
"
\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"session.view = view"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Frame Level Detections**"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [],
"source": [
"def bigger_bbox(x, y, width, height, index):\n",
"\n",
" offset = 0.001\n",
" x_offset = index*offset\n",
"\n",
" # Apply the offsets to the parameters\n",
" n_x = x - x_offset\n",
" n_width = width + x_offset*2 \n",
"\n",
" return [n_x, y, n_width, height]"
]
},
{
"cell_type": "code",
"execution_count": 168,
"metadata": {},
"outputs": [],
"source": [
"for sample in view:\n",
" base_file_name = os.path.basename(sample.filepath)\n",
" video_id, extension = os.path.splitext(base_file_name)\n",
" row, inst = find_row_by_video_id(wlasl_df,video_id)\n",
" gloss = row[\"gloss\"]\n",
" bbox = inst[\"bbox\"]\n",
" imw = sample.metadata.frame_width\n",
" imh = sample.metadata.frame_height\n",
" x1 = bbox[0] / imw\n",
" x2 = bbox[2] / imw\n",
" y1 = bbox[1] / imh\n",
" y2 = bbox[3] / imh\n",
" bbox = [x1,y1,x2-x1,y2-y1]\n",
" for frame_no, frame in sample.frames.items():\n",
" new_bbox = bigger_bbox(bbox[0],bbox[1],bbox[2],bbox[3],frame_no)\n",
" det = fo.Detection(bounding_box=new_bbox,label=gloss)\n",
" frame['Frame Label'] = fo.Detections(detections=[det])\n",
" \n",
" sample.save()"
]
},
{
"cell_type": "code",
"execution_count": 169,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
"\n",
"
\n",
" \n",
"
\n",
"

\n",
"
\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"session.view = view"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Temporal Detections**"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
"for sample in view:\n",
" base_file_name = os.path.basename(sample.filepath)\n",
" video_id, extension = os.path.splitext(base_file_name)\n",
" row, inst = find_row_by_video_id(wlasl_df,video_id)\n",
" gloss = row[\"gloss\"]\n",
" sample[\"TD Word\"] = fo.TemporalDetection.from_timestamps(\n",
" [0, sample.metadata.duration/2], label=gloss, sample=sample\n",
" )\n",
" sample[\"TD Word2\"] = fo.TemporalDetection.from_timestamps(\n",
" [sample.metadata.duration/2, sample.metadata.duration], label=\"ASL is awesome!\", sample=sample\n",
" )\n",
"\n",
" \n",
" sample.save()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"\n",
"\n",
"
\n",
" \n",
"
\n",
"

\n",
"
\n",
"\n",
""
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"session.view = view"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Video Classification**"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [],
"source": [
"for sample in view:\n",
" base_file_name = os.path.basename(sample.filepath)\n",
" video_id, extension = os.path.splitext(base_file_name)\n",
" row, inst = find_row_by_video_id(wlasl_df,video_id)\n",
" gloss = row[\"gloss\"]\n",
" sample[\"class\"] = fo.Classification(label=gloss)\n",
" \n",
" sample.save()\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 173,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"session.view = view"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
""
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}